# Libraries for loading the CSV data and performing EDA tasks
import pandas as pd
import numpy as np
#importing for visualisation
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# data preprocessing
from sklearn.preprocessing import StandardScaler
# data splitting
from sklearn.model_selection import train_test_split
from collections import Counter
# data modeling
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
## Load the raw study data from CSV into a DataFrame and list its column names
csv_path = "C:\\Users\\sumas\\Documents\\Data science\\o8t_test data set\\o8t_testdata.csv"
data_set = pd.read_csv(csv_path)
data_set.columns
Index(['Subject_ID', 'Label', 'Gender', 'Age', 'Education', 'Height', 'Weight',
'History of cerebrovascular disease', 'History of hypertension',
'History of diabetes', 'History of coronary heart disease',
'History of hyperlipidemia', 'History of anemia',
'History of CO poisoning',
'History of general anesthesia during surgery',
'History of abnormal thyroid function',
'History of traumatic brain injury', 'Family history of dementia',
'Smoking history', 'Drinking history', 'Unnamed: 21', 'NPI', 'MoCAB',
'MMSE', 'IADL', 'HAMA', 'HAMD', 'C1 HVLT(immediate memory)',
'C5 HVLT(delayed recall 5min)', 'C8 HVLT(delayed recall 20min)',
'C4 logical memory(WMS)', 'C6 Boston Naming Test',
'C3 articulateness and verbal fluencey-vegetable (BNT)', 'C7-STT_A',
'C7-STT_B', 'C2 CFT Rey-O(imitation)', 'C9 CFT Rey-O(recall)',
'HD1 depressive mood', 'HD2 guilty', 'HD3 suicidal',
'HD7 work & interests', 'HA6 Total Score of Depressive Mood \n',
'Total score of Depression core factors', 'HA1 Anxiety', 'HA2 Tension',
'HA3 Fear', 'HA14 Interview perfomance',
'Total score of Anxiety factors'],
dtype='object')
# Preview the first five rows of the freshly loaded data.
data_set.head(n=5)
| Subject_ID | Label | Gender | Age | Education | Height | Weight | History of cerebrovascular disease | History of hypertension | History of diabetes | ... | HD2 guilty | HD3 suicidal | HD7 work & interests | HA6 Total Score of Depressive Mood \n | Total score of Depression core factors | HA1 Anxiety | HA2 Tension | HA3 Fear | HA14 Interview perfomance | Total score of Anxiety factors | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | MCI | Male | 77 | 165.0 | 90.0 | 9 | Yes | Yes | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 2 | Dementia | Male | 81 | 169.0 | 70.0 | 0 | Yes | Yes | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| 2 | 3 | Normal | Female | 77 | 155.0 | 55.0 | 12 | No | Yes | No | ... | 0.0 | 0.0 | 0.0 | 2.0 | 2.0 | 2.0 | 0.0 | 0.0 | 0.0 | 2.0 |
| 3 | 4 | Normal | Male | 75 | 169.0 | 75.0 | 9 | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 5 | Normal | Male | 68 | NaN | NaN | 9 | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 48 columns
# Show the dtype pandas inferred for every column; columns reported as
# `object` hold non-numeric values and need encoding or conversion later.
data_set.dtypes
Subject_ID int64 Label object Gender object Age int64 Education float64 Height float64 Weight int64 History of cerebrovascular disease object History of hypertension object History of diabetes object History of coronary heart disease object History of hyperlipidemia object History of anemia object History of CO poisoning object History of general anesthesia during surgery object History of abnormal thyroid function object History of traumatic brain injury object Family history of dementia object Smoking history object Drinking history object Unnamed: 21 float64 NPI float64 MoCAB float64 MMSE float64 IADL float64 HAMA float64 HAMD float64 C1 HVLT(immediate memory) float64 C5 HVLT(delayed recall 5min) float64 C8 HVLT(delayed recall 20min) float64 C4 logical memory(WMS) float64 C6 Boston Naming Test float64 C3 articulateness and verbal fluencey-vegetable (BNT) float64 C7-STT_A object C7-STT_B object C2 CFT Rey-O(imitation) float64 C9 CFT Rey-O(recall) float64 HD1 depressive mood float64 HD2 guilty float64 HD3 suicidal float64 HD7 work & interests float64 HA6 Total Score of Depressive Mood \n float64 Total score of Depression core factors float64 HA1 Anxiety float64 HA2 Tension float64 HA3 Fear float64 HA14 Interview perfomance float64 Total score of Anxiety factors float64 dtype: object
### Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns ###
data_set.describe()
| Subject_ID | Age | Education | Height | Weight | Unnamed: 21 | NPI | MoCAB | MMSE | IADL | ... | HD2 guilty | HD3 suicidal | HD7 work & interests | HA6 Total Score of Depressive Mood \n | Total score of Depression core factors | HA1 Anxiety | HA2 Tension | HA3 Fear | HA14 Interview perfomance | Total score of Anxiety factors | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 217.000000 | 217.000000 | 215.000000 | 215.000000 | 217.000000 | 0.0 | 215.000000 | 215.000000 | 215.000000 | 215.000000 | ... | 216.000000 | 216.000000 | 216.000000 | 216.000000 | 216.000000 | 215.000000 | 215.000000 | 215.000000 | 215.000000 | 215.000000 |
| mean | 109.000000 | 71.589862 | 162.423256 | 62.200000 | 10.451613 | NaN | 2.883721 | 16.944186 | 22.627907 | 17.972093 | ... | 0.277778 | 0.092593 | 0.379630 | 0.648148 | 1.879630 | 0.748837 | 0.530233 | 0.148837 | 0.120930 | 1.548837 |
| std | 62.786676 | 7.582417 | 7.912631 | 10.014685 | 4.500896 | NaN | 4.836627 | 7.385680 | 6.511755 | 6.044165 | ... | 0.679717 | 0.348740 | 0.762097 | 0.833204 | 2.657915 | 1.005678 | 0.874225 | 0.479667 | 0.379718 | 1.961352 |
| min | 1.000000 | 50.000000 | 138.000000 | 41.000000 | 0.000000 | NaN | 0.000000 | 0.000000 | 0.000000 | 14.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 55.000000 | 66.000000 | 157.000000 | 55.000000 | 9.000000 | NaN | 0.000000 | 11.000000 | 20.000000 | 14.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 109.000000 | 72.000000 | 162.000000 | 60.000000 | 9.000000 | NaN | 1.000000 | 18.000000 | 25.000000 | 16.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| 75% | 163.000000 | 77.000000 | 169.000000 | 69.000000 | 14.000000 | NaN | 3.000000 | 23.000000 | 27.000000 | 19.500000 | ... | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 3.000000 | 1.500000 | 1.000000 | 0.000000 | 0.000000 | 2.000000 |
| max | 217.000000 | 92.000000 | 182.000000 | 92.000000 | 20.000000 | NaN | 27.000000 | 29.000000 | 30.000000 | 54.000000 | ... | 3.000000 | 2.000000 | 3.000000 | 3.000000 | 13.000000 | 3.000000 | 3.000000 | 2.000000 | 2.000000 | 8.000000 |
8 rows × 31 columns
# Print the non-null count and dtype of every column to spot missing values.
data_set.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217 entries, 0 to 216
Data columns (total 48 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Subject_ID 217 non-null int64
1 Label 212 non-null object
2 Gender 217 non-null object
3 Age 217 non-null int64
4 Education 215 non-null float64
5 Height 215 non-null float64
6 Weight 217 non-null int64
7 History of cerebrovascular disease 217 non-null object
8 History of hypertension 217 non-null object
9 History of diabetes 217 non-null object
10 History of coronary heart disease 217 non-null object
11 History of hyperlipidemia 217 non-null object
12 History of anemia 217 non-null object
13 History of CO poisoning 217 non-null object
14 History of general anesthesia during surgery 217 non-null object
15 History of abnormal thyroid function 217 non-null object
16 History of traumatic brain injury 217 non-null object
17 Family history of dementia 217 non-null object
18 Smoking history 217 non-null object
19 Drinking history 217 non-null object
20 Unnamed: 21 0 non-null float64
21 NPI 215 non-null float64
22 MoCAB 215 non-null float64
23 MMSE 215 non-null float64
24 IADL 215 non-null float64
25 HAMA 215 non-null float64
26 HAMD 216 non-null float64
27 C1 HVLT(immediate memory) 213 non-null float64
28 C5 HVLT(delayed recall 5min) 213 non-null float64
29 C8 HVLT(delayed recall 20min) 213 non-null float64
30 C4 logical memory(WMS) 213 non-null float64
31 C6 Boston Naming Test 213 non-null float64
32 C3 articulateness and verbal fluencey-vegetable (BNT) 213 non-null float64
33 C7-STT_A 213 non-null object
34 C7-STT_B 213 non-null object
35 C2 CFT Rey-O(imitation) 213 non-null float64
36 C9 CFT Rey-O(recall) 213 non-null float64
37 HD1 depressive mood 216 non-null float64
38 HD2 guilty 216 non-null float64
39 HD3 suicidal 216 non-null float64
40 HD7 work & interests 216 non-null float64
41 HA6 Total Score of Depressive Mood
216 non-null float64
42 Total score of Depression core factors 216 non-null float64
43 HA1 Anxiety 215 non-null float64
44 HA2 Tension 215 non-null float64
45 HA3 Fear 215 non-null float64
46 HA14 Interview perfomance 215 non-null float64
47 Total score of Anxiety factors 215 non-null float64
dtypes: float64(28), int64(3), object(17)
memory usage: 81.5+ KB
# The data set contains null values, so count the missing entries
# in each column.
missing_per_column = data_set.isna().sum()
missing_per_column
Subject_ID 0 Label 5 Gender 0 Age 0 Education 2 Height 2 Weight 0 History of cerebrovascular disease 0 History of hypertension 0 History of diabetes 0 History of coronary heart disease 0 History of hyperlipidemia 0 History of anemia 0 History of CO poisoning 0 History of general anesthesia during surgery 0 History of abnormal thyroid function 0 History of traumatic brain injury 0 Family history of dementia 0 Smoking history 0 Drinking history 0 Unnamed: 21 217 NPI 2 MoCAB 2 MMSE 2 IADL 2 HAMA 2 HAMD 1 C1 HVLT(immediate memory) 4 C5 HVLT(delayed recall 5min) 4 C8 HVLT(delayed recall 20min) 4 C4 logical memory(WMS) 4 C6 Boston Naming Test 4 C3 articulateness and verbal fluencey-vegetable (BNT) 4 C7-STT_A 4 C7-STT_B 4 C2 CFT Rey-O(imitation) 4 C9 CFT Rey-O(recall) 4 HD1 depressive mood 1 HD2 guilty 1 HD3 suicidal 1 HD7 work & interests 1 HA6 Total Score of Depressive Mood \n 1 Total score of Depression core factors 1 HA1 Anxiety 2 HA2 Tension 2 HA3 Fear 2 HA14 Interview perfomance 2 Total score of Anxiety factors 2 dtype: int64
# `duplicated()` flags every row that repeats an earlier row with True.
# Summing that boolean mask gives the number of duplicate rows; `int()`
# keeps the displayed result a plain Python integer.
int(data_set.duplicated().sum())
0
### Remove the unnamed column, which holds no values (0 non-null entries) ###
empty_columns = ["Unnamed: 21"]
data_set.drop(columns=empty_columns, inplace=True)
data_set
| Subject_ID | Label | Gender | Age | Education | Height | Weight | History of cerebrovascular disease | History of hypertension | History of diabetes | ... | HD2 guilty | HD3 suicidal | HD7 work & interests | HA6 Total Score of Depressive Mood \n | Total score of Depression core factors | HA1 Anxiety | HA2 Tension | HA3 Fear | HA14 Interview perfomance | Total score of Anxiety factors | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | MCI | Male | 77 | 165.0 | 90.0 | 9 | Yes | Yes | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 2 | Dementia | Male | 81 | 169.0 | 70.0 | 0 | Yes | Yes | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| 2 | 3 | Normal | Female | 77 | 155.0 | 55.0 | 12 | No | Yes | No | ... | 0.0 | 0.0 | 0.0 | 2.0 | 2.0 | 2.0 | 0.0 | 0.0 | 0.0 | 2.0 |
| 3 | 4 | Normal | Male | 75 | 169.0 | 75.0 | 9 | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 5 | Normal | Male | 68 | NaN | NaN | 9 | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 212 | 213 | MCI | Male | 70 | 170.0 | 85.0 | 12 | Yes | Yes | Yes | ... | 1.0 | 0.0 | 2.0 | 2.0 | 8.0 | 2.0 | 2.0 | 0.0 | 0.0 | 4.0 |
| 213 | 214 | NaN | Male | 76 | 172.0 | 73.0 | 12 | No | No | No | ... | 1.0 | 2.0 | 2.0 | 2.0 | 9.0 | 2.0 | 1.0 | 1.0 | 1.0 | 5.0 |
| 214 | 215 | NaN | Female | 78 | 158.0 | 58.0 | 6 | No | No | Yes | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 215 | 216 | NaN | Male | 66 | 172.0 | 77.0 | 12 | Yes | Yes | No | ... | 2.0 | 0.0 | 1.0 | 1.0 | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 216 | 217 | NaN | Female | 61 | 169.0 | 78.0 | 12 | No | Yes | No | ... | 0.0 | 0.0 | 1.0 | 2.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
217 rows × 47 columns
# List the distinct values of the target column to spot missing or
# non-standard diagnosis labels.
data_set["Label"].unique()
array(['MCI', 'Dementia', 'Normal', 'MCI?Dementia', nan, 'MCI,naMCI_MD'],
dtype=object)
### Drop rows whose target label is missing or not one of the three diagnosis classes ###
# Rows are selected by their label value instead of by hard-coded row
# numbers, so the right subjects are removed even if the CSV is re-ordered,
# re-exported or extended. NaN labels are not in `valid_labels`, so they are
# removed by the same filter.
valid_labels = ["Normal", "MCI", "Dementia"]
unusable_rows = data_set.index[~data_set["Label"].isin(valid_labels)]
data_set.drop(index=unusable_rows, inplace=True)
data_set.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 210 entries, 0 to 212
Data columns (total 47 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Subject_ID 210 non-null int64
1 Label 210 non-null object
2 Gender 210 non-null object
3 Age 210 non-null int64
4 Education 208 non-null float64
5 Height 208 non-null float64
6 Weight 210 non-null int64
7 History of cerebrovascular disease 210 non-null object
8 History of hypertension 210 non-null object
9 History of diabetes 210 non-null object
10 History of coronary heart disease 210 non-null object
11 History of hyperlipidemia 210 non-null object
12 History of anemia 210 non-null object
13 History of CO poisoning 210 non-null object
14 History of general anesthesia during surgery 210 non-null object
15 History of abnormal thyroid function 210 non-null object
16 History of traumatic brain injury 210 non-null object
17 Family history of dementia 210 non-null object
18 Smoking history 210 non-null object
19 Drinking history 210 non-null object
20 NPI 208 non-null float64
21 MoCAB 208 non-null float64
22 MMSE 208 non-null float64
23 IADL 208 non-null float64
24 HAMA 208 non-null float64
25 HAMD 209 non-null float64
26 C1 HVLT(immediate memory) 206 non-null float64
27 C5 HVLT(delayed recall 5min) 206 non-null float64
28 C8 HVLT(delayed recall 20min) 206 non-null float64
29 C4 logical memory(WMS) 206 non-null float64
30 C6 Boston Naming Test 206 non-null float64
31 C3 articulateness and verbal fluencey-vegetable (BNT) 206 non-null float64
32 C7-STT_A 206 non-null object
33 C7-STT_B 206 non-null object
34 C2 CFT Rey-O(imitation) 206 non-null float64
35 C9 CFT Rey-O(recall) 206 non-null float64
36 HD1 depressive mood 209 non-null float64
37 HD2 guilty 209 non-null float64
38 HD3 suicidal 209 non-null float64
39 HD7 work & interests 209 non-null float64
40 HA6 Total Score of Depressive Mood
209 non-null float64
41 Total score of Depression core factors 209 non-null float64
42 HA1 Anxiety 208 non-null float64
43 HA2 Tension 208 non-null float64
44 HA3 Fear 208 non-null float64
45 HA14 Interview perfomance 208 non-null float64
46 Total score of Anxiety factors 208 non-null float64
dtypes: float64(27), int64(3), object(17)
memory usage: 78.8+ KB
## Encode the target and the listed text columns as integer category codes.
## In the displayed output the codes follow sorted category order, e.g.
## Label: Dementia -> 0, MCI -> 1, Normal -> 2; Gender: Female -> 0, Male -> 1.
categorical_columns = [
    "Label",
    "Gender",
    "History of cerebrovascular disease",
    "History of hypertension",
    "History of diabetes",
    "History of coronary heart disease",
    "History of hyperlipidemia",
    "History of anemia",
    "History of CO poisoning",
    "History of general anesthesia during surgery",
    "History of abnormal thyroid function",
    "History of traumatic brain injury",
    "Family history of dementia",
    "Smoking history",
    "Drinking history",
]
for column in categorical_columns:
    # Only columns still stored as raw `object` values are converted.
    if data_set[column].dtype != 'object':
        continue
    data_set[column] = data_set[column].astype('category').cat.codes
data_set.head(5)
| Subject_ID | Label | Gender | Age | Education | Height | Weight | History of cerebrovascular disease | History of hypertension | History of diabetes | ... | HD2 guilty | HD3 suicidal | HD7 work & interests | HA6 Total Score of Depressive Mood \n | Total score of Depression core factors | HA1 Anxiety | HA2 Tension | HA3 Fear | HA14 Interview perfomance | Total score of Anxiety factors | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 1 | 77 | 165.0 | 90.0 | 9 | 1 | 1 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 2 | 0 | 1 | 81 | 169.0 | 70.0 | 0 | 1 | 1 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| 2 | 3 | 2 | 0 | 77 | 155.0 | 55.0 | 12 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 0.0 | 2.0 | 2.0 | 2.0 | 0.0 | 0.0 | 0.0 | 2.0 |
| 3 | 4 | 2 | 1 | 75 | 169.0 | 75.0 | 9 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 5 | 2 | 1 | 68 | NaN | NaN | 9 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 47 columns
### Remove "Education", "Height" and "Weight", which the author judged not important for diagnosing the disease ###
# NOTE(review): the displayed head()/describe() output suggests these three
# headers are shifted ("Education" ~162, "Height" ~62, "Weight" ~10), so the
# CSV header should be checked before any of these columns is reused.
ignored_columns = ["Education", "Height", "Weight"]
data_set.drop(columns=ignored_columns, inplace=True)
data_set.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 210 entries, 0 to 212
Data columns (total 44 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Subject_ID 210 non-null int64
1 Label 210 non-null int8
2 Gender 210 non-null int8
3 Age 210 non-null int64
4 History of cerebrovascular disease 210 non-null int8
5 History of hypertension 210 non-null int8
6 History of diabetes 210 non-null int8
7 History of coronary heart disease 210 non-null int8
8 History of hyperlipidemia 210 non-null int8
9 History of anemia 210 non-null int8
10 History of CO poisoning 210 non-null int8
11 History of general anesthesia during surgery 210 non-null int8
12 History of abnormal thyroid function 210 non-null int8
13 History of traumatic brain injury 210 non-null int8
14 Family history of dementia 210 non-null int8
15 Smoking history 210 non-null int8
16 Drinking history 210 non-null int8
17 NPI 208 non-null float64
18 MoCAB 208 non-null float64
19 MMSE 208 non-null float64
20 IADL 208 non-null float64
21 HAMA 208 non-null float64
22 HAMD 209 non-null float64
23 C1 HVLT(immediate memory) 206 non-null float64
24 C5 HVLT(delayed recall 5min) 206 non-null float64
25 C8 HVLT(delayed recall 20min) 206 non-null float64
26 C4 logical memory(WMS) 206 non-null float64
27 C6 Boston Naming Test 206 non-null float64
28 C3 articulateness and verbal fluencey-vegetable (BNT) 206 non-null float64
29 C7-STT_A 206 non-null object
30 C7-STT_B 206 non-null object
31 C2 CFT Rey-O(imitation) 206 non-null float64
32 C9 CFT Rey-O(recall) 206 non-null float64
33 HD1 depressive mood 209 non-null float64
34 HD2 guilty 209 non-null float64
35 HD3 suicidal 209 non-null float64
36 HD7 work & interests 209 non-null float64
37 HA6 Total Score of Depressive Mood
209 non-null float64
38 Total score of Depression core factors 209 non-null float64
39 HA1 Anxiety 208 non-null float64
40 HA2 Tension 208 non-null float64
41 HA3 Fear 208 non-null float64
42 HA14 Interview perfomance 208 non-null float64
43 Total score of Anxiety factors 208 non-null float64
dtypes: float64(25), int64(2), int8(15), object(2)
memory usage: 52.3+ KB
## Keep only the two total scores ("Total score of Depression core factors"
## and "Total score of Anxiety factors") and drop the individual item columns.
depression_item_columns = [
    "HD1 depressive mood",
    "HD2 guilty",
    "HD3 suicidal",
    "HD7 work & interests",
    "HA6 Total Score of Depressive Mood \n",
]
anxiety_item_columns = [
    "HA1 Anxiety",
    "HA2 Tension",
    "HA3 Fear",
    "HA14 Interview perfomance",
]
data_set.drop(columns=depression_item_columns + anxiety_item_columns, inplace=True)
## Handling missing values: every remaining NaN is replaced with 0 ###
data_set.fillna(0, inplace=True)
## Replace the "Unable to complete" cells of the C7-STT columns with 0 and
## convert both columns from object to float64.
# The result is assigned back to the frame instead of calling
# `replace(..., inplace=True)` on `data_set[col]`: that chained in-place call
# acts on a temporary Series, which pandas warns about and which leaves
# `data_set` unchanged under Copy-on-Write, making the float conversion fail.
for column in ["C7-STT_A", "C7-STT_B"]:
    data_set[column] = (
        data_set[column].replace("Unable to complete", 0).astype('float64')
    )
# Build an automated profiling report of the cleaned data set and display it
# as the cell output.
# NOTE(review): the pandas_profiling package appears to have been renamed
# upstream (ydata-profiling) — confirm which package is installed before
# re-running this cell.
from pandas_profiling import ProfileReport
profile = ProfileReport(data_set)
profile